{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Hindi Build Word2Vec.ipynb",
      "provenance": [],
      "collapsed_sections": [
        "qx0PzL8X-puW"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hLatfJVA-N_4",
        "colab_type": "text"
      },
      "source": [
        "# Build Word2Vec"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qx0PzL8X-puW",
        "colab_type": "text"
      },
      "source": [
        "## Helpers"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jJm5JiCx-5I9",
        "colab_type": "code",
        "outputId": "07cf89ca-1804-437d-ec75-326fc7b83d3b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 128
        }
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
            "\n",
            "Enter your authorization code:\n",
            "··········\n",
            "Mounted at /content/drive\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3R6rhK-KAiR8",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from __future__ import print_function\n",
        "\n",
        "\n",
        "import json\n",
        "import os\n",
        "import pandas\n",
        "import io\n",
        "import sys\n",
        "import re\n",
        "\n",
        "\n",
        "class ProgressBar(object):\n",
        "    DEFAULT = 'Progress: %(bar)s %(percent)3d%%'\n",
        "    FULL = '%(bar)s %(current)d/%(total)d (%(percent)3d%%) %(remaining)d to go'\n",
        "\n",
        "    def __init__(self, total, width=40, fmt=DEFAULT, symbol='=',\n",
        "                 output=sys.stderr):\n",
        "        assert len(symbol) == 1\n",
        "\n",
        "        self.total = total\n",
        "        self.width = width\n",
        "        self.symbol = symbol\n",
        "        self.output = output\n",
        "        self.fmt = re.sub(r'(?P<name>%\\(.+?\\))d',\n",
        "            r'\\g<name>%dd' % len(str(total)), fmt)\n",
        "\n",
        "        self.current = 0\n",
        "\n",
        "    def __call__(self):\n",
        "        percent = self.current / float(self.total)\n",
        "        size = int(self.width * percent)\n",
        "        remaining = self.total - self.current\n",
        "        bar = '[' + self.symbol * size + ' ' * (self.width - size) + ']'\n",
        "\n",
        "        args = {\n",
        "            'total': self.total,\n",
        "            'bar': bar,\n",
        "            'current': self.current,\n",
        "            'percent': percent * 100,\n",
        "            'remaining': remaining\n",
        "        }\n",
        "        print('\\r' + self.fmt % args, file=self.output, end='')\n",
        "\n",
        "    def done(self):\n",
        "        self.current = self.total\n",
        "        self()\n",
        "        print('', file=self.output)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NTc4-5PNBXe9",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "default_path = \"drive/My Drive/Hindi_News/\"\n",
        "\n",
        "#-------------------------------save/load--------------------------------------#\n",
        "pickle_path = default_path + \"pickles/\"\n",
        "\n",
        "import pickle\n",
        "\n",
        "def save(obj , filename):\n",
        "  print(\"saving {} ..\".format(filename))\n",
        "  with open(filename, 'wb') as handle:\n",
        "      pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
        "      \n",
        "def load(filename):\n",
        "  print(\"loading {} ..\".format(filename))\n",
        "  with open(filename, 'rb') as handle:\n",
        "    return pickle.load(handle)\n",
        "#-----------------------------------------------------------------------------------#  "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1JZpLkHD-sRW",
        "colab_type": "text"
      },
      "source": [
        "## Word Embedding"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "s5G76-weSjcI",
        "colab_type": "code",
        "outputId": "e56ebe32-2117-4d26-a812-f5fd29b3fd83",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        }
      },
      "source": [
        "!unzip \"drive/My Drive/Hindi_News/HindiNewsBook.zip\" -d  \"drive/My Drive/Hindi_News/\""
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Archive:  drive/My Drive/Hindi_News/HindiNewsBook.zip\n",
            "  inflating: drive/My Drive/Hindi_News/HindiNewsBook.csv  \n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gNkpXi6XJuPr",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "default_path = \"drive/My Drive/Hindi_News/\"\n",
        "reviews_csv = default_path + \"HindiNewsBook.csv\""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uBRWCPUMgn2B",
        "colab_type": "code",
        "outputId": "eee66a74-be77-417c-f689-9248493e7eb1",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 198
        }
      },
      "source": [
        "import pandas as pd\n",
        "reviews = pd.read_csv(reviews_csv)\n",
        "reviews.head()"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>सेtext</th>\n",
              "      <th>title</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>पाकिस्तान से खबरें संगीन रंगीन हैं। पापा जरदार...</td>\n",
              "      <td>बिलावल-हिना का खूबसूरत घोटाला</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>इन 5 बातों का हर भाई रखे ख्याल, नहीं तो रिश्तो...</td>\n",
              "      <td>Read useful articles about relationships from ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>छत्तीसगढ़ में इंसानियत को शर्मसार कर देने वाली...</td>\n",
              "      <td>जमीनी विवाद में भाभी और भतीजी को उतारा मौत के घाट</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>एनसीसी की गर्ल कैडेट को पोर्न वीडियो भेजने पर ...</td>\n",
              "      <td>NCC गर्ल कैडेट को पोर्न क्लिप भेजता था मेजर जन...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>देश, दुनिया, खेल, बिजनेस और बॉलीवुड में क्‍या ...</td>\n",
              "      <td>Breaking News:एक क्लिक में पढ़ें गुरुवार दिनभर...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                              सेtext                                              title\n",
              "0  पाकिस्तान से खबरें संगीन रंगीन हैं। पापा जरदार...                      बिलावल-हिना का खूबसूरत घोटाला\n",
              "1  इन 5 बातों का हर भाई रखे ख्याल, नहीं तो रिश्तो...  Read useful articles about relationships from ...\n",
              "2  छत्तीसगढ़ में इंसानियत को शर्मसार कर देने वाली...  जमीनी विवाद में भाभी और भतीजी को उतारा मौत के घाट\n",
              "3  एनसीसी की गर्ल कैडेट को पोर्न वीडियो भेजने पर ...  NCC गर्ल कैडेट को पोर्न क्लिप भेजता था मेजर जन...\n",
              "4  देश, दुनिया, खेल, बिजनेस और बॉलीवुड में क्‍या ...  Breaking News:एक क्लिक में पढ़ें गुरुवार दिनभर..."
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NzzAeH2SAOxQ",
        "colab_type": "code",
        "outputId": "d8b3ee71-cf01-44f9-b1c8-9c56a3bb4f1f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        }
      },
      "source": [
        "documents = reviews[\"सेtext\"] \n",
        "summary = reviews[\"title\"]\n",
        "print(len(documents))\n",
        "print(len(summary))"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "60347\n",
            "60347\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EwjFKBnIgwFx",
        "colab_type": "code",
        "outputId": "cc270ea9-cff3-42ad-b17d-bbe8641f4008",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 126
        }
      },
      "source": [
        "def _whitespace_tokenize(texts, label):\n",
        "  \"\"\"Split every entry of `texts` on whitespace while showing a progress bar.\n",
        "\n",
        "  Each entry goes through str() first, so a non-string entry is tokenized\n",
        "  as its string form. `label` only appears in the printed status line.\n",
        "  \"\"\"\n",
        "  print(\"loading {}...\".format(label))\n",
        "  progress = ProgressBar(len(texts), fmt=ProgressBar.FULL)\n",
        "  tokenized = []\n",
        "  for text in texts:\n",
        "    tokenized.append(str(text).split())\n",
        "    progress.current += 1\n",
        "    progress()\n",
        "  progress.done()\n",
        "  return tokenized\n",
        "\n",
        "\n",
        "# One helper replaces the loop that was previously copy-pasted for both columns.\n",
        "clean_documents = _whitespace_tokenize(documents, \"documents\")\n",
        "save(clean_documents , \"clean_documents_ketab.pkl\")\n",
        "\n",
        "clean_summary = _whitespace_tokenize(summary, \"summaries\")\n",
        "save(clean_summary , \"clean_summary_ketab.pkl\")"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "loading documents...\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "[========================================] 60347/60347 (100%)     0 to go\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "saving clean_documents_ketab.pkl ..\n",
            "loading summaries...\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "[========================================] 60347/60347 (100%)     0 to go\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "saving clean_summary_ketab.pkl ..\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Nca0sB7vqpb-",
        "colab_type": "code",
        "outputId": "e0994047-d255-4a84-deec-08a6ef8bc855",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 55
        }
      },
      "source": [
        "print(clean_documents[0])"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['पाकिस्तान', 'से', 'खबरें', 'संगीन', 'रंगीन', 'हैं।', 'पापा', 'जरदारी', 'तमाम', 'तरह', 'के', 'घोटालों', 'में', 'उलझे', 'हुए', 'हैं।', 'बेटे', 'बिलावल', 'जरदारी', 'उम्र', 'में', 'अपने', 'से', 'दस', 'साल', 'बड़ी', 'पाकिस्तानी', 'विदेश', 'मंत्री', 'हिना', 'रब्बानी', 'के', 'खूबसूरत', 'बालों', 'में', 'उलझे', 'हुए', 'हैं।', 'यह', 'भी', 'एक', 'घोटाला', 'है।', 'इस', 'तरह', 'के', 'घोटाले', 'होते', 'रहें,', 'तो', 'अखबार', 'पढ़ना,', 'टीवी', 'न्यूज', 'चैनल', 'देखना', 'इंटरेस्टिंग', 'हो', 'जाता', 'है', 'जी।', 'बिलावल', 'के', 'साथ', 'हिना', 'के', 'फोटू', 'छपें', 'रोज', 'अखबार', 'में,', 'तो', 'अखबार', 'की', 'विजुअल', 'अपील', 'बढ़', 'जाती', 'है।', 'इनके', 'फोटू', 'देखकर', 'ये', 'खबर', 'पढ़ने', 'की', 'हिम्मत', 'आ', 'जाती', 'है', 'कि', 'भारतीय', 'मंत्रियों', 'ने', 'विदेशी', 'यात्राओं', 'में', 'तय', 'रकम', 'से', '12', 'गुना', 'ज्यादा', 'खर्च', 'किया।', 'ये', 'सच्ची', 'खबर', 'परेशान', 'करती', 'है।', 'हिना', 'और', 'बिलावल', 'के', 'बारे', 'में', 'गॉसिप', 'ही', 'सही,', 'फोटोजेनिक', 'राहत', 'देती', 'है।', 'हमारे', 'यहां', 'इतने', 'खूबसूरत', 'घोटाले', 'नहीं', 'होते।', 'घोटाला', 'क्या', 'हुआ-कोयला', 'घोटाला।', 'रोज', 'रोज', 'कोयला', 'खान', 'के', 'फोटू', 'देखो', 'और', 'बोर', 'होते', 'रहो।', 'एक', 'जमाने', 'में', 'लालूजी', 'का', 'चारा', 'घोटाला', 'पॉप्युलर', 'था।', 'लालूजी', 'का', 'फोटू', 'रोज', 'छपता', 'था', 'एक', 'अखबार', 'में,', 'उससे', 'बोर', 'होकर', 'अखबार', 'ने', 'नया', 'फोटू', 'छापा', 'एक', 'भैंस', 'का-नीचे', 'कैप्शन', 'लगाया', 'गया,', 'इसका', 'चारा', 'खाया', 'गया।', 'लो', 'जी', 'लालूजी', 'के', 'फोटू', 'से', 'छूटे,', 'तो', 'भैंस', 'का', 'फोटू', 'देखो।', 'खूबसूरत', 'घोटाला', 'चाहिए', 'जी,', 'खूबसूरत', 'घोटाला।', 'मेरा', 'निवेदन', 'है', 'कि', 'हिनाजी', 'और', 'बिलावलजी', 'का', 'मामला', 'और', 'आगे', 'जाना', 'चाहिए।', 'दोनों', 'की', 'शादी', 'हो', 'जाए', 'और', 'फिर', 'इनमें', 'से', 'कोई', 'भी', 'प्राइम', 'मिनिस्टर', 'बन', 'जाए', 'तो', 'बिलावलजी', 'को', 'सीनियर', 'हिना', 'डांट', 'डपट', 'तो', 'सकती', 'हैं', 'कि', 'देखो', 'इतना', 'खुलेआम', 'अपने', 'पापा', 'की', 'तरह', 'दस', 
'परसेंट', 'ना', 'खाया', 'करो।', 'और', 'मान', 'लो', 'बिलावलजी', 'और', 'हिनाजी', 'संयुक्त', 'रूप', 'से', 'खाने', 'लगे', 'तो', 'भी', 'तो', 'दोनों', 'के', 'फोटोजेनिक', 'खूबसूरत', 'फोटो', 'ही', 'छपेंगे', 'अखबार', 'में', 'खूबसूरत', 'घोटाला।', 'पापा', 'जरदारी', 'बेटे', 'जरदारी', 'के', 'इस', 'अफेयर', 'के', 'खिलाफ', 'हैं,', 'पापाजी', 'के', 'बारे', 'में', 'कहा', 'जाता', 'है', 'कि', 'वह', 'किसी', 'भी', 'समझदार', 'और', 'काबिल', 'बंदे', 'की', 'मौजूदगी', 'में', 'खुद', 'को', 'असहज', 'महसूस', 'करते', 'हैं।', 'हिनाजी', 'परमानेंटली', 'फैमिली', 'में', 'ही', 'आ', 'गयीं', 'तो', 'परेशानी', 'हो', 'जाएगी।', 'पर', 'जो', 'भी', 'हो,', 'हम', 'सबको', 'और', 'खूबसूरत', 'घोटालों', 'का', 'स्वागत', 'और', 'प्रतीक्षा', 'करनी', 'चाहिए।', 'इंडियन', 'नवोदित', 'नेताओं', 'सबक', 'लो,', 'बिलावल', 'जी', 'से।']\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hw0GxP6MUE8Z",
        "colab_type": "code",
        "outputId": "28d114b5-e355-4e64-eefd-99ea6979575a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "print(clean_summary[0])"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['बिलावल-हिना', 'का', 'खूबसूरत', 'घोटाला']\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oc-84NYQG0k7",
        "colab_type": "code",
        "outputId": "0560c335-495c-48af-9d86-f2a2ee34f870",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "default_path"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'drive/My Drive/Hindi_News/'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 25
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ypIsldP4JURZ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "clean_documents_list = clean_documents + clean_summary"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JWp1MNwQJa2A",
        "colab_type": "code",
        "outputId": "ad39e8ab-2e16-4f24-9efe-e9f4b940bea6",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "len(clean_documents_list)"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "120694"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 28
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "E_y_U_4DUWd3",
        "colab_type": "code",
        "outputId": "da6b59ab-3623-48ab-c3c2-fa6cde73150d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "len(clean_documents_list[0][10])"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "2"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 29
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fym9XC-2kV-g",
        "colab_type": "code",
        "outputId": "7c56ef29-3528-4d45-abab-a158d751a0a3",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 74
        }
      },
      "source": [
        "import gensim\n",
        "\n",
        "# NOTE: the variable keeps its historical name because a later cell still refers to it;\n",
        "# the vectors are trained on the Hindi corpus assembled above.\n",
        "# `size=` is the gensim 3.x spelling of the vector dimensionality (`vector_size` in 4.x).\n",
        "model_arabic_vec = gensim.models.Word2Vec(\n",
        "        size=150,\n",
        "        window=10,\n",
        "        min_count=2,\n",
        "        workers=10)\n",
        "# Passing the corpus to the constructor already trains the model, so a following\n",
        "# train() call would train it a second time. Build the vocabulary and train exactly\n",
        "# once, for the intended 10 epochs.\n",
        "model_arabic_vec.build_vocab(clean_documents_list)\n",
        "model_arabic_vec.train(clean_documents_list, total_examples=len(clean_documents_list), epochs=10)\n",
        "model_arabic_vec.wv.save(default_path +\"model_hindi.model\")"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:402: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
            "  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-pjDW1uJTAgj",
        "colab_type": "code",
        "outputId": "80dadb23-c1ef-40ec-c935-23d55848c12a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "default_path"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'drive/Colab Notebooks/Model 4_5/'"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 19
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xTo28z6RS-B8",
        "colab_type": "code",
        "outputId": "e61efd2e-ea44-4521-b7c3-6310f9620981",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 74
        }
      },
      "source": [
        "# NOTE(review): this stores the same word vectors a second time, under an Arabic-named\n",
        "# file in a different Drive folder. It looks like a leftover from another notebook;\n",
        "# confirm it is still wanted.\n",
        "model_arabic_vec.wv.save('drive/My Drive/Colab Notebooks/Model 4_5/' +\"model_arabic_extreme.model\")"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
            "  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8o2BHZPa7asP",
        "colab_type": "code",
        "outputId": "536d94ad-015e-453a-eca8-b126fbca1b5b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 74
        }
      },
      "source": [
        "from gensim.models import KeyedVectors\n",
        "wv = KeyedVectors.load(default_path +\"model_hindi.model\", mmap='r')"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:402: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
            "  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PtdMqy6O7qFf",
        "colab_type": "code",
        "outputId": "11598ce4-6cdf-47b7-f49b-503bd879634c",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 255
        }
      },
      "source": [
        "wv.most_similar(positive = \"से\")"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
            "  if np.issubdtype(vec.dtype, np.int):\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('उससे', 0.6536398530006409),\n",
              " ('इससे', 0.6046265363693237),\n",
              " ('इनसे', 0.5620524287223816),\n",
              " ('में', 0.5374647378921509),\n",
              " ('उनसे', 0.5367596745491028),\n",
              " ('authorised', 0.4722856879234314),\n",
              " ('blend', 0.4571074843406677),\n",
              " ('की', 0.44924241304397583),\n",
              " ('calculations', 0.44774317741394043),\n",
              " ('को', 0.4473797380924225)]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "D9PoAiSU7vmu",
        "colab_type": "code",
        "outputId": "a98244d5-4e1b-4d41-e8ef-58af6d48f17e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 255
        }
      },
      "source": [
        "wv.most_similar(positive = \"पाकिस्तान\")"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
            "  if np.issubdtype(vec.dtype, np.int):\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('पाक', 0.8744723200798035),\n",
              " ('पाकिस्\\u200dतान', 0.7458062171936035),\n",
              " ('अफगानिस्तान', 0.7245218753814697),\n",
              " ('पाकिस्तानी', 0.7201468348503113),\n",
              " ('चीन', 0.6863623857498169),\n",
              " ('पीओके', 0.6857386827468872),\n",
              " ('तालिबान', 0.6807665824890137),\n",
              " ('PAK', 0.6637619137763977),\n",
              " ('बांग्लादेश', 0.647534966468811),\n",
              " ('भारत', 0.6450281739234924)]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FFKw8y5t3tKI",
        "colab_type": "code",
        "outputId": "4e9518e7-969c-4c73-ed2c-fa537311a416",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 255
        }
      },
      "source": [
        "wv.most_similar(positive = \"रंगीन\")"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
            "  if np.issubdtype(vec.dtype, np.int):\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('जींस', 0.6590641140937805),\n",
              " ('रंग-बिरंगे', 0.6505199074745178),\n",
              " ('आकर्षक', 0.6378679275512695),\n",
              " ('कुर्ता', 0.6336793899536133),\n",
              " ('नीले', 0.6324629783630371),\n",
              " ('सफेद', 0.6312911510467529),\n",
              " ('रंग', 0.6301822066307068),\n",
              " ('पेंट', 0.625964879989624),\n",
              " ('रंग-बिरंगी', 0.6163167953491211),\n",
              " ('रंगों', 0.6155998706817627)]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KEGeFjVw3_a9",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DGHTnUrc5ewh",
        "colab_type": "text"
      },
      "source": [
        "## Build vocabulary dictionary"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8X-6CiKq6iNC",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!pip install nltk"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VwKL9MRs6p4r",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import nltk\n",
        "nltk.download('punkt')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YODVAZZu5gqV",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from nltk.tokenize import word_tokenize\n",
        "import collections\n",
        "\n",
        "def build_dict(train_article_list,VOCAB_SIZE):\n",
        "    \"\"\"Count tokens over a corpus and write the most frequent ones to disk.\n",
        "\n",
        "    Every entry of `train_article_list` is tokenized with NLTK's\n",
        "    `word_tokenize`. The `VOCAB_SIZE` most common tokens are written to the\n",
        "    file \"vocab\" inside `pickle_path`, one \"<token> <count>\" pair per line.\n",
        "\n",
        "    Args:\n",
        "        train_article_list: sized iterable of text strings.\n",
        "        VOCAB_SIZE: maximum number of distinct tokens to write.\n",
        "\n",
        "    Returns:\n",
        "        None; the vocabulary file is the only result.\n",
        "    \"\"\"\n",
        "    vocab_counter = collections.Counter()\n",
        "\n",
        "    progress = ProgressBar(len(train_article_list), fmt=ProgressBar.FULL)\n",
        "    for sentence in train_article_list:\n",
        "        # Counter.update accepts the token list directly; no intermediate copy needed.\n",
        "        vocab_counter.update(word_tokenize(sentence))\n",
        "        progress.current += 1\n",
        "        progress()\n",
        "    progress.done()\n",
        "\n",
        "    print (\"Writing vocab file...\")\n",
        "    # Create the target folder if it does not exist yet, otherwise open() fails.\n",
        "    os.makedirs(pickle_path, exist_ok=True)\n",
        "    with open(os.path.join(pickle_path, \"vocab\"), 'w', encoding=\"utf-8\") as writer:\n",
        "      for word, count in vocab_counter.most_common(VOCAB_SIZE):\n",
        "        writer.write(word + ' ' + str(count) + '\\n')\n",
        "    print (\"Finished writing vocab file\")\n",
        "    return"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "m0xg1Q9A5mUI",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import pandas as pd\n",
        "default_path = \"drive/My Drive/Hindi_News/\"\n",
        "pickle_path = default_path + \"pickles/\"\n",
        "\n",
        "reviews = pd.read_csv(default_path + \"HindiNewsBook.csv\")\n",
        "# Only the last expression of a cell is displayed, so print the diagnostics explicitly.\n",
        "print(reviews.shape)\n",
        "print(reviews.isnull().sum())\n",
        "# Drop rows that contain any missing value and renumber the remaining rows from 0.\n",
        "reviews = reviews.dropna().reset_index(drop=True)\n",
        "reviews.head()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ztGfnTCy5oZo",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Pull both columns out as plain Python lists in one step; iterating with\n",
        "# iterrows() builds a Series per row and is far slower for the same result.\n",
        "Text = reviews[\"सेtext\"].tolist()\n",
        "Summary = reviews[\"title\"].tolist()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "FxESXPAd5qxV",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "build_dict(reviews.सेtext,200000)"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}